# This script computes network centrality measures (degree, clustering, closeness) and the average shortest path length
# FILE INPUTS:
# 1. SCnetwork2.dta
# 2. vtnic.dta

# FILE OUTPUTS:
# 1. centralities_full_network_1994_2019.dta
# 2. centralities_vtnic_1989_2019.dta

import pandas as pd
import networkx as net
import numpy as np
import matplotlib.pyplot as plt

# from pandasql import sqldf
import concurrent.futures
import os

# Make the data folder the working directory: the .dta files in this script
# are opened and written with relative paths, so they resolve against it.
# NOTE(review): "your_path" looks like a placeholder -- replace it with the
# real data directory before running.
os.chdir("your_path")

# -------------- Calculating Centralities -------------- #

# -------  Calculating centrality for the entire network  
def _centrality_table(edges, year, source):
    """Build one directed graph from `edges` and tabulate its node centralities.

    Parameters
    ----------
    edges : pandas.DataFrame
        Edge list with at least the columns 'gvkey1' (edge tail), 'gvkey2'
        (edge head) and 'score'. Any other column is ignored.
    year : scalar
        Value written to the 'fyear' column of the result.
    source : int
        Value written to the 'source' column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per node with columns 'gvkey', 'degree', 'clustering',
        'closeness' (each multiplied by 100), 'fyear' and 'source'.
    """
    # Only the three named columns are read here, so the remaining columns of
    # `edges` do not need to be dropped first.
    graph = net.from_pandas_edgelist(
        edges, 'gvkey1', 'gvkey2', 'score', create_using=net.DiGraph
    )
    # 'score' is stored on the edges, but no weight/distance argument is
    # passed to the measures below, so all three are computed unweighted.
    measures = {
        'degree': net.degree_centrality(graph),
        'clustering': net.clustering(graph),
        'closeness': net.closeness_centrality(graph),
    }
    table = pd.DataFrame({
        name: {node: value * 100 for node, value in by_node.items()}
        for name, by_node in measures.items()
    }).rename_axis('gvkey').reset_index()
    table['fyear'] = year
    table['source'] = source
    return table


df_full = pd.read_stata('SCnetwork2.dta')

# NOTE(review): the filter keeps fiscal years 1994-2020 inclusive, while the
# output file name below says 1994_2019 -- confirm which range is intended.
df_full = df_full[(df_full['fyear'] >= 1994) & (df_full['fyear'] <= 2020)]

results = []
for year, df_year in df_full.groupby('fyear'):
    # source = 0 marks the network built from all of the year's edges.
    results.append(_centrality_table(df_year, year, 0))
    # Separate measures for each individual source 1, 2 and 3; a source with
    # no edges in this year contributes no rows.
    for s in [1, 2, 3]:
        df_source = df_year[df_year['source'] == s]
        if not df_source.empty:
            results.append(_centrality_table(df_source, year, s))
centralities = pd.concat(results, ignore_index=True)

# Export the dataset to the main folder:
centralities.to_stata("centralities_full_network_1994_2019.dta")


### -------- SHORTEST PATH -------- ###
# NOTE: this section takes a long time to run.
# The average shortest path length is only defined for a connected graph (for
# a directed graph: strongly connected), so each year's network is first
# reduced to its largest strongly connected component.
spath = []
for year in range(2003, 2021):  # 2003 through 2020; the upper bound is exclusive
    print(year)  # progress marker, printed before the slow computation starts
    df_temp = df_full[df_full['fyear'] == year]

    if df_temp.empty:
        # Without edges there is no component to pick and max() would raise
        # ValueError; record NaN so spath stays aligned with the year range.
        print(f"{year}: no edges for this year, recording NaN")
        spath.append(float('nan'))
        continue

    # Only 'gvkey1', 'gvkey2' and 'score' are read; other columns are ignored.
    network_year = net.from_pandas_edgelist(
        df_temp, 'gvkey1', 'gvkey2', 'score', create_using=net.DiGraph
    )

    # Select the largest strongly connected component ...
    largest_component = max(net.strongly_connected_components(network_year), key=len)
    # ... and restrict the graph to the nodes of that component.
    network_year2 = network_year.subgraph(largest_component)
    # No weight argument is given, so every edge counts as one step.
    avg_shortest_path_length = net.average_shortest_path_length(network_year2)
    print(year)  # repeated so the value below is labelled in the log
    print(avg_shortest_path_length)
    spath.append(avg_shortest_path_length)

print(np.hstack(spath))
# One row per year in loop order (row 0 is 2003); the year itself is not stored.
spath2 = pd.DataFrame(spath, columns=['shortest_path'])
print(spath2)
